#########
# SETUP #
#########

# The program is meant to be run from a working directory that contains a sub-directory holding all of the data files it needs
# Directories containing the relevant data are:
#
#    - ALL_all_evo_data
#       * contains results files for all CEACAM sequences
#       * CEACAM5 data is for CEACAM5 sequences trimmed using strategy 1 and aligned by MAFFT
#    - CCM5_mafft_ds2_all_evo_data 
#       * contains results files for CEACAM5 sequences trimmed using strategy 2 and aligned by MAFFT
#    - CCM5_mafft_ds3_all_evo_data
#       * contains results files for CEACAM5 sequences trimmed using strategy 3 and aligned by MAFFT
#    - CCM5_mus_ds4_all_evo_data
#       * contains results files for CEACAM5 sequences trimmed using strategy 1 and aligned by MUSCLE
#    - CCM5_mus_ds5_all_evo_data
#       * contains results files for CEACAM5 sequences trimmed using strategy 2 and aligned by MUSCLE
#    - CCM5_mus_ds6_all_evo_data
#       * contains results files for CEACAM5 sequences trimmed using strategy 3 and aligned by MUSCLE
#


# *** CHECK BEFORE RUNNING ***
#   Domains List & GARD sites list
#       - make sure only the relevant CEACAM5 domain structure and predicted GARD sites are uncommented
#
#   Directory name
#       - depending on which set of files is to be run the relevant directory should be named "all_evo_data/"


#required packages

library(evobiR)
library(ggplot2)
library(ggforce)

######################################################################################
# Lists of CEACAM Domains and GARD recombination sites to be used to annotate graphs #
######################################################################################

#Domains List
#
# Named list keyed by CEACAM id ("CCM##"). Each entry is a list of domains, and
# each domain is written as c(type, start, end):
#   - type  : one-letter code ("N", "I", "T" or "C") that add_shape() maps to a
#             fill colour. "N" is the N-domain; the other codes are presumably
#             Ig-like / transmembrane / cytoplasmic -- TODO confirm.
#   - start : first amino-acid position of the domain
#   - end   : last amino-acid position of the domain
# Because c() mixes a string with numbers, every element is stored as a
# character string; add_shape() converts start/end back to numeric.
#
# Only ONE CCM05 assignment may be active at a time. The alternatives for the
# other trimming/alignment data sets are kept below as commented-out lines.

domain_list <- list()

domain_list[["CCM01" ]]<- list(c("N", 1, 107), c("I", 109, 200), c("I", 203, 284), c("I", 289, 373), c("T", 387, 410), c("C", 411, 480))
domain_list[["CCM03"]] <- list(c("N", 2, 105), c("I", 108, 198), c("I", 199, 273), c("T", 290, 316), c("C", 317, 392))
domain_list[["CCM04"]] <- list(c("N", 13, 117), c("T", 129, 153), c("C", 154, 221))
# domain assignment for CCM05_mafft_ds1 (the currently active CCM05 entry):
domain_list[["CCM05"]] <- list(c("N", 2, 103), c("I", 107, 200), c("I", 202, 283), c("I", 284, 363), c("I", 375, 458), c("I", 459, 547), c("I", 548, 624), c("T", 630, 654))
#domain assignment for CCM05_mafft_ds2 - domain_list[["CCM05"]] <- list(c("N", 2, 103), c("I", 107, 200), c("I", 201, 283), c("I", 284, 363), c("I", 375, 458), c("T", 457, 478))
#domain assignment for CCM05_mafft_ds3 - domain_list[["CCM05"]] <- list(c("N", 2, 103), c("I", 107, 200), c("I", 202, 285))
#domain assignment for CCM05_mus_ds4 - domain_list[["CCM05"]] <- list(c("N", 2, 101), c("I", 105, 198), c("I", 199, 281), c("I", 282, 361), c("I", 373, 446), c("I", 456, 546), c("I", 547, 622), c("T", 628, 652))
#domain assignment for CCM05_mus_ds5 - domain_list[["CCM05"]] <- list(c("N", 2, 101), c("I", 105, 196), c("I", 197, 278), c("I", 282, 374), c("I", 379, 449), c("T", 454, 478))
#domain assignment for CCM05_mus_ds6 - domain_list[["CCM05"]] <- list(c("N", 2, 101), c("I", 105, 194), c("I", 195, 272), c("T", 278, 302))
domain_list[["CCM06"]] <- list(c("N", 2, 107), c("I", 110, 199), c("I", 203, 284), c("T", 290, 307))
domain_list[["CCM07"]] <- list(c("N", 4, 106), c("I", 109, 199), c("T", 208, 226))
domain_list[["CCM08"]] <- list(c("N", 1, 104), c("I", 108, 197), c("I", 204, 276))
domain_list[["CCM16"]] <- list(c("N", 4, 106), c("I", 112, 201), c("I", 220, 291), c("I", 300, 401))
domain_list[["CCM18"]] <- list(c("N", 22, 101), c("I", 133, 196), c("I", 212, 282), c("T", 303, 327), c("C",328, 338))
domain_list[["CCM19"]] <- list(c("N", 3, 108), c("T", 126, 150), c("C", 151, 267))
domain_list[["CCM20"]] <- list(c("N", 94, 184), c("I", 202, 275), c("I", 291, 372), c("I", 386, 461), c("T", 473, 496), c("C", 497, 628))
domain_list[["CCM21"]] <- list(c("N", 66, 171), c("I", 194, 276), c("T", 282, 304), c("C", 305, 336))

#GARD sites list
#
# Named list keyed by CEACAM id ("CCM##") holding the GARD recombination
# breakpoints for that CEACAM. The plotting helpers divide each value by 3 and
# round up to obtain a codon position, so the values are presumably nucleotide
# coordinates in the alignment -- TODO confirm.
# Only CEACAMs that have an entry here get breakpoint markers on their plot.
#
# Only ONE CCM05 entry may be active at a time. The alternatives for the other
# trimming/alignment data sets are kept below as commented-out lines.

GARD_sites_list <- list()

GARD_sites_list[["CCM01"]] <- c(306, 598)
GARD_sites_list[["CCM03"]] <- c(362, 735)
# CCM5_mafft_ds1 (the currently active CCM05 entry):
GARD_sites_list[["CCM05"]] <- c(283, 445, 592, 842)
#CCM5_mafft_ds2 - GARD_sites_list[["CCM05"]] <- c(266, 434, 592, 746)
#CCM5_mafft_ds3 - GARD_sites_list[["CCM05"]] <- c(301, 491)
#CCM5_mus_ds4 - GARD_sites_list[["CCM05"]] <- c(326, 435, 522, 702, 836)
#CCM5_mus_ds5 - GARD_sites_list[["CCM05"]] <- c(266, 439, 552)
#CCM5_mus_ds6 - GARD_sites_list[["CCM05"]] <- c(295, 524)
GARD_sites_list[["CCM06"]] <- c(456)
GARD_sites_list[["CCM07"]] <- c(306)

##############################################################################################
# Functions to extract sites determined to be under selection & their probabilty values from #
# evolutionary analyses results files                                                        #
##############################################################################################

#For extract_MEME & extract_FUBAR
#input files are named "CCM##_FUBAR.csv", "CCM##_MEME.csv" or "CCM##_GARD_MEME.csv"

# Reads a MEME results CSV and keeps the sites whose p-value is <= 0.05.
#   file_name: path to a CSV that read.csv() exposes with "Codon" and
#              "p.value" columns.
#   Returns a two-column data frame (Codon, p.value) with the retained rows.
extract_MEME <- function(file_name) {
    meme_results <- read.csv(file_name)
    site_table <- setNames(
        as.data.frame(cbind(meme_results$Codon, meme_results$p.value)),
        c("Codon", "p.value")
    )
    # which() also drops rows whose p-value is NA
    significant_rows <- which(site_table$p.value <= 0.05)
    site_table[significant_rows, ]
}

# Reads a FUBAR results CSV and keeps the sites whose probability column
# ("Prob..alpha...beta...1" as named by read.csv()) is >= 0.9.
#   file_name: path to the FUBAR CSV; its first column is treated as the site.
#   Returns a two-column data frame (Codon, Prob) with the retained rows.
extract_FUBAR <- function(file_name) {
    fubar_results <- read.csv(file_name)
    # The first column holds the site number whatever its header says
    colnames(fubar_results)[1] <- "Site"
    site_table <- setNames(
        as.data.frame(cbind(fubar_results$Site, fubar_results$Prob..alpha...beta...1)),
        c("Codon", "Prob")
    )
    # which() also drops rows whose probability is NA
    supported_rows <- which(site_table$Prob >= 0.9)
    site_table[supported_rows, ]
}
# Gets the Bayes Empirical Bayes sites under selection from a PAML NS sites file.
# Input files are named "CCM##_NS_extracted_sites.txt" and are generated with the
# jupyter and python files listed below:
#   PAML_NS_sites_data_get_v14.ipynb; Extract_NS_sites_v31.py; PAML_NS_files.txt
# (versions up to date as of 6-23-20)
#
#   file_name: path to the tab-delimited extracted-sites file.
#   Returns a two-column data frame (Codon, Prob) holding the rows whose
#   Analysis is "M2_BE_Bayes" and whose "pvalue" column is >= 0.95.
#   Side effect: stores the value in the first row of the first column in the
#   global variable pep_len (used by the calling script as the peptide length).
extract_NS <- function(file_name) {
    ns_table <- read.delim(file_name)
    pep_len <<- ns_table[1, 1]
    # Drop the first column now that its first value has been stored
    ns_table <- ns_table[, 2:ncol(ns_table)]
    is_m2_hit <- ns_table$Analysis == "M2_BE_Bayes" & ns_table$pvalue >= 0.95
    hits <- ns_table[which(is_m2_hit), ]
    hits <- hits[, 1:2]
    colnames(hits) <- c("Codon", "Prob")
    hits
}

# Returns just the first column (the sites under selection) of the table
# produced for one evo test.
get_codon_column <- function(x) {
    x[, 1]
}

# Builds a table with one row for every site in the protein, giving how many
# analyses called that site as evolving rapidly (0 = none, > 0 = number of
# evo tests). Used as the input to the sliding window analysis.
#
#   Peptide:         length of the peptide; sites 1..Peptide are reported.
#   Pos_Sites_Table: two-column table; column 1 holds the site (numeric or a
#                    factor, e.g. from as.data.frame(table(...))) and column 2
#                    the value to report for that site.
#   Returns a data frame with columns "site" and "Freq", ordered by site.
#   A site that appears more than once in Pos_Sites_Table keeps one row per
#   occurrence, in the table's original order.
#
# An empty Pos_Sites_Table gives an all-zero table instead of an error.
get_all_sites <- function(Peptide, Pos_Sites_Table) {
    # seq_len() gives an empty sequence for a zero length, unlike seq(1, n)
    positions <- seq_len(max(0, floor(Peptide)))

    # Go through character so a factor column yields the site numbers
    # themselves rather than its internal level codes
    site_ids <- as.numeric(as.character(Pos_Sites_Table[, 1]))
    site_freq <- Pos_Sites_Table[, 2]

    hit_rows <- which(site_ids %in% positions)
    unlisted <- positions[!(positions %in% site_ids)]

    site <- c(site_ids[hit_rows], unlisted)
    Freq <- c(site_freq[hit_rows], rep(0, length(unlisted)))

    # order() is stable, so repeated sites keep their original relative order
    by_position <- order(site)
    data.frame(site = site[by_position], Freq = Freq[by_position])
}

# Builds one rounded rectangle layer per domain of a given CEACAM, to be added
# to the graph showing sites under selection.
#
#   list_of_domains: list of c(type, start, end) vectors (see domain_list).
#   Returns the accumulated geom_shape() layers (NULL when there are no domains).
#   Side effect: the corner data frame of the most recent domain is stored in
#   the global variable `shape`.
add_shape <- function(list_of_domains) {
    layers <- NULL
    for (domain in list_of_domains) {

        # Positions arrive as character strings, so convert them back
        left <- as.numeric(as.character(domain[2]))
        right <- as.numeric(as.character(domain[3]))

        # Rectangle corners; the band sits below the x axis (y from -0.4 to -0.1)
        shape <<- data.frame(x = c(left, right, right, left), y = c(-0.1, -0.1, -0.4, -0.4))

        # Fill colour by domain type; an unrecognised type leaves it empty
        domain_type <- domain[1]
        inner_fill <- if (domain_type == "N") {
            "#c04847"
        } else if (domain_type == "I") {
            "#c0c5ce"
        } else if (domain_type == "T") {
            "#74818a"
        } else if (domain_type == "C") {
            "#242a31"
        } else {
            ""
        }

        layers <- c(layers, geom_shape(data = shape, aes(x = x, y = y), fill = inner_fill, radius = unit(5, "mm")))
    }
    layers
}

#Functions to add GARD determined recombination breakpoints to graphs
#add_gard_sites1: small black triangle (points downward) at each GARD site
#add_gard_sites2: larger blue triangle (filled and outlined) at each GARD site

# Each breakpoint is divided by 3 and rounded up to give its codon position.
#   list_of_GARD_sites: numeric vector of GARD breakpoints.
#   Returns the accumulated geom_polygon() layers (NULL when there are none).
#   Side effect: the triangle of the most recent breakpoint is stored in the
#   global variable `shape2`.
add_gard_sites1 <- function(list_of_GARD_sites) {
    layers <- NULL
    for (breakpoint in list_of_GARD_sites) {
        codon <- ceiling(breakpoint / 3)
        # Tip at (codon, -0.2); base spans codon - 2 .. codon + 2 at y = -0.1
        shape2 <<- data.frame(x = c(codon, codon - 2, codon + 2), y = c(-0.2, -0.1, -0.1))
        triangle <- geom_polygon(data = shape2, aes(x = x, y = y), fill = "black", size = 1)
        layers <- c(layers, triangle)
    }
    layers
}

# Builds one blue triangle layer per GARD breakpoint. Each breakpoint is divided
# by 3 and rounded up to give its codon position.
#   list_of_GARD_sites: numeric vector of GARD breakpoints.
#   Returns the accumulated geom_polygon() layers (NULL when there are none).
#   Side effect: the triangle of the most recent breakpoint is stored in the
#   global variable `shape2`.
add_gard_sites2 <- function(list_of_GARD_sites) {
    layers <- NULL
    for (breakpoint in list_of_GARD_sites) {
        codon <- ceiling(breakpoint / 3)
        # Tip at (codon, -0.11); base spans codon - 5 .. codon + 5 at y = 0.15
        shape2 <<- data.frame(x = c(codon, codon - 5, codon + 5), y = c(-0.11, 0.15, 0.15))
        triangle <- geom_polygon(data = shape2, aes(x = x, y = y), color = "#00aeef", fill = "#00aeef", size = .75)
        layers <- c(layers, triangle)
    }
    # Alternative colours that have been used for the marker:
    #e48683 - red
    #7aadd9 - blue
    #8ccd8c - green
    #c1c1c1 - gray
    layers
}

########################################################################################
# Goes through all of the evo test files by CCM and ...                                #
# - extracts sites likely to be under selection                                        #
# - determines which sites were picked out as evolving rapidly at least once, twice    #
#   or by all three tests                                                              #
# - determines the over lap in sites under selection for all combinations of evo tests #
# - for CEACAMs with GARD determined breakpoints outputs the MEME results for both     #
#   the analyses with and without GARD breakpoints                                     #
# - includes a column for manual input of GARD breakpoints                             #
# - outputs the data into a dataframe that is saved to a tab delim txt file            #
########################################################################################

# Builds the list of CCM names to iterate over: the text before the first "_"
# in every file name found in "all_evo_data/", with duplicates removed.

CCMs <- unique(sapply(
    strsplit(list.files("all_evo_data/"), "_"),
    function(name_parts) name_parts[[1]]
))

# Will hold the final output table; one row is bound on per CCM
All_Sites_Summary <- NULL

# Main loop: for every CCM, load its evo test results, draw the domain / GARD
# diagram and append one row to All_Sites_Summary.
#
# NOTE: pep_len is a global set by extract_NS(), so every CCM is expected to
# have an NS file; without one the value from the previous CCM is reused.
for (i in CCMs) {
    file_list <- list()
    file_name_list <- list()

    # For a given CCM, extract the data of each evo test file into file_list;
    # file_name_list records (in the same order) which test each entry is.
    # The test is identified by the second "_"-separated token of the file name.
    for (x in list.files(path = "all_evo_data/", pattern = i)) {

        evo_data <- strsplit(strsplit(x, ".csv")[[1]], "_")[[1]][2]
        locale <- paste("all_evo_data/", x, sep = "")

        if (evo_data == "FUBAR") {
            FUBAR_data <- extract_FUBAR(locale)
            file_list[[length(file_list) + 1]] <- FUBAR_data
            file_name_list <- c(file_name_list, "FUBAR")
        }
        if (evo_data == "MEME") {
            MEME_data <- extract_MEME(locale)
            file_list[[length(file_list) + 1]] <- MEME_data
            file_name_list <- c(file_name_list, "MEME")
        }
        if (evo_data == "GARD") {
            # GARD_MEME files have the MEME layout
            GARD_data <- extract_MEME(locale)
            file_list[[length(file_list) + 1]] <- GARD_data
            file_name_list <- c(file_name_list, "GARD")
        }
        if (evo_data == "NS") {
            NS_data <- extract_NS(locale)
            file_list[[length(file_list) + 1]] <- NS_data
            file_name_list <- c(file_name_list, "NS")
        }
    }

    # List of the vectors of sites under selection, one per evo test
    temp_var <- lapply(file_list, get_codon_column)

    # When a GARD_MEME file is present the plain MEME sites are taken out of
    # temp_var and kept as a separate object
    if ("GARD" %in% file_name_list) {
        no_GARD_MEME <- temp_var[match("MEME", file_name_list)]
        temp_var <- temp_var[-match("MEME", file_name_list)]
        file_name_list <- file_name_list[-match("MEME", file_name_list)]
        Gard_col <- "-"
    } else {
        no_GARD_MEME <- c()
        Gard_col <- "None"
    }

    # Combine the sites found by all of the evo tests into a data frame listing
    # each site and the number of tests (Freq) that identified it
    temp_var2 <- unlist(temp_var)
    temp_var2 <- as.data.frame(table(temp_var2))

    ####################
    # Graphing Section #
    ####################

    selection_freq_table <- temp_var2
    colnames(selection_freq_table)[1] <- "sites"
    selection_freq_table$sites <- as.numeric(as.character(selection_freq_table$sites))

    # Data frame of every site in the protein with the number of times (0-3)
    # it was called as being under selection by the different evo analyses
    sites_selection_df <- get_all_sites(pep_len, temp_var2)

    # Simplify for the sliding window: 1 for any site under selection, not
    # weighted by how many tests found it. Only Freq is overwritten so the
    # site numbers stay intact.
    sites_selection_df$Freq[sites_selection_df$Freq > 0] <- 1

    # 10 amino acid sliding window mean of the selected/not-selected flag
    pos_selection_slide <- SlidingWindow(mean, sites_selection_df$Freq, 10)
    pos_selection_slide <- as.data.frame(pos_selection_slide)
    pos_selection_slide <- cbind(seq(1, nrow(pos_selection_slide)), pos_selection_slide)
    colnames(pos_selection_slide) <- c("sites", "Freq")

    # xlim is fixed so that the CCM pictures are scaled relative to their
    # peptide length; the upper limit is slightly larger than the largest
    # peptide sequence and can be changed for other peptides.

    # Peptide length rounded down to the nearest hundred (any number of digits)
    rounded_length <- floor(pep_len / 100) * 100

    breaks_I_want <- c(seq(0, rounded_length, 100), pep_len)

    p2 <- ggplot(selection_freq_table, aes(x = sites, y = 0.35)) +
        ylim(c(-0.4, 0.9)) +
        geom_segment(aes(x = 0, y = -0.25, xend = pep_len, yend = -0.25), size = 4, color = "black", lineend = "round") +
        scale_x_continuous(limits = c(-10, 660), breaks = breaks_I_want) +
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_rect(fill = "transparent", colour = NA), plot.background = element_rect(fill = "transparent", colour = NA), axis.text = element_blank(), axis.title = element_blank(), axis.ticks = element_blank())

    p3 <- p2 + add_shape(domain_list[[i]])

    # GARD breakpoints are drawn only for CCMs that have them. CCMs without
    # breakpoints are still plotted and summarised (no early `next`).
    if (i %in% names(GARD_sites_list)) {
        p4 <- p3 +
            #add_gard_sites1(GARD_sites_list[[i]]) +
            add_gard_sites2(GARD_sites_list[[i]])
    } else {
        p4 <- p3
    }

    #p5 <- p4 + ggtitle(i)

    print(p4)
    new_name <- as.character(paste(i, "Pos Selection_alt_sliding_window.pdf", collapse = ""))
    # Pass the plot explicitly rather than relying on the last plot drawn
    ggsave(new_name, plot = p4, width = 500, height = 100, units = "mm")

    #########################
    # Summary Table Section #
    #########################

    # Sites found at least once, by at least two tests, and by three tests
    once <- temp_var2$temp_var2
    twice <- temp_var2$temp_var2[which(temp_var2$Freq >= 2)]
    thrice <- temp_var2$temp_var2[which(temp_var2$Freq == 3)]

    # Intersect of sites under selection between the different evo tests.
    # When GARD is present the GARD_MEME sites stand in for MEME.
    NS_sites <- unlist(temp_var[match("NS", file_name_list)])
    FUBAR_sites <- unlist(temp_var[match("FUBAR", file_name_list)])
    if ("MEME" %in% file_name_list) {
        MEME_sites <- unlist(temp_var[match("MEME", file_name_list)])
    } else {
        MEME_sites <- unlist(temp_var[match("GARD", file_name_list)])
    }

    NS_FUBAR_overlap <- intersect(NS_sites, FUBAR_sites)
    NS_MEME_overlap <- intersect(NS_sites, MEME_sites)
    FUBAR_MEME_overlap <- intersect(FUBAR_sites, MEME_sites)

    # Combine the data for this CCM into a single row and bind it to
    # All_Sites_Summary; empty fields are reported as "None"
    to_bind <- c(i, pep_len, paste(unlist(NS_sites), collapse = ", "), paste(unlist(FUBAR_sites), collapse = ", "), Gard_col, paste(unlist(MEME_sites), collapse = ", "), paste(unlist(no_GARD_MEME), collapse = ", "), paste(unlist(once), collapse = ", "), paste(unlist(twice), collapse = ", "), paste(unlist(thrice), collapse = ", "), paste(unlist(NS_FUBAR_overlap), collapse = ", "), paste(unlist(NS_MEME_overlap), collapse = ", "), paste(unlist(FUBAR_MEME_overlap), collapse = ", "))
    to_bind[to_bind == ""] <- "None"
    All_Sites_Summary <- rbind(All_Sites_Summary, to_bind)
}

# Labels the compiled sites-under-selection matrix (rows numbered 1..n, columns
# named by `header`), then converts it to a data frame
header <- c(
    "CEACAM",
    "Peptide Length",
    "PAML - Bayes Empirical Bayes",
    "HyPhy - FUBAR",
    "HyPhy - GARD Break Points",
    "HyPhy - MEME",
    "HyPhy - MEME without GARD",
    "All Sites 1x",
    "All Sites 2x",
    "All Sites 3x",
    "PAML/FUBAR Overlap",
    "PAML/MEME Overlap",
    "FUBAR/MEME Overlap"
)
dimnames(All_Sites_Summary) <- list(seq(nrow(All_Sites_Summary)), header)
All_Sites_Summary <- as.data.frame(All_Sites_Summary)

# Writes the data frame to a tab-delimited file
write.table(All_Sites_Summary, "Sites_Under_Selection_Summary.txt", sep = "\t", row.names = FALSE)

#############################################################################################################################
# After running the above code (without the above write.table function) just for CEACAM1 data also did the following to     #
# create a larger image of the CEACAM1 N-domain                                                                             #
# Used the CCM1_all_evo_data folder in same working directory with name changed to all_evo_data and name of folder with all #
# CCM datafiles changed to ALL_all_evo_data
# Highlighted the following:
# - sites under selection
# - residues important for bacterial adhesin binding
# - residues important for host protein binding 
##############################################################################################################################

#Functions

# Builds one small triangle layer per curated site.
#
#   list_of_sites: list (or vector) of site positions to mark.
#   for_name:      one-element named list whose name selects the styling:
#                  "host_binding" -> green outline, no fill;
#                  "adhesins"     -> yellowish fill and outline.
#   Returns the accumulated geom_polygon() layers (NULL when there are no sites).
#   Side effect: the triangle of the most recent site is stored in the global
#   variable `shape3`.
add_curation <- function(list_of_sites, for_name) {
    layers <- NULL
    for (site in list_of_sites) {
        # Base spans site - 0.2 .. site + 0.2 at y = -0.1, tip at (site, 0.1)
        shape3 <<- data.frame(x = c(site - 0.2, site + 0.2, site), y = c(-0.1, -0.1, 0.1))

        category <- names(for_name)
        if (category == "host_binding") {
            inner_fill <- NA
            outline_color <- "#008744" #green
        } else if (category == "adhesins") {
            inner_fill <- "#ffbf00" #yellowish
            outline_color <- "#ffbf00"
        } else {
            # Unrecognised category: both colours stay empty
            inner_fill <- ""
            outline_color <- ""
        }

        marker <- geom_polygon(data = shape3, aes(x = x, y = y), fill = inner_fill, color = outline_color, size = 0.75)
        layers <- c(layers, marker)
    }
    layers
}

# Sites under selection that fall inside the N-domain.
# 107 is the last amino acid of the N-domain in CEACAM1 (from the domain_list object).

temp_var2[which(as.numeric(as.character(temp_var2$temp_var2)) <= 107), ]

# Rectangle for the N-domain and a triangle at codon 306/3
Nshape <- data.frame(x = c(1, 107, 107, 1), y = c(0.1, 0.1, 0.3, 0.3))
Nshape2 <- data.frame(x = c(306 / 3, 306 / 3 - 0.5, 306 / 3 + 0.5), y = c(0.23, 0.6, 0.6))

# Curated residues, keyed by the category name that add_curation() styles on
currated_sites <- list()
currated_sites[["adhesins"]] <- list(27, 28, 29, 32, 34, 39, 41, 43, 44, 47, 49, 89, 91, 95, 96)
currated_sites[["host_binding"]] <- list(32, 40, 42, 43, 44, 47, 89)

# Base canvas: backbone segments plus faint guide lines labelled 0.50 / 0.75 / 1.00
pA <- ggplot(data = subset(selection_freq_table, sites <= 107), aes(x = sites, y = 0.35)) +
    ylim(c(-0.25, 1.5)) +
    geom_segment(aes(x = 0, y = 0.2, xend = 50, yend = 0.2), size = 4, color = "black", lineend = "round") +
    geom_segment(aes(x = 50, y = 0.2, xend = 2 * 107, yend = 0.2), size = 4, color = "black", lineend = "square") +
    geom_segment(aes(x = 6, y = 0.5, xend = 2 * 107, yend = 0.5), linetype = "solid", color = "#dadbda", alpha = 0.1) +
    geom_segment(aes(x = 6, y = .75, xend = 2 * 107, yend = .75), linetype = "longdash", color = "#dadbda", alpha = 0.1) +
    geom_segment(aes(x = 6, y = 1, xend = 2 * 107, yend = 1), linetype = "dashed", color = "#dadbda", alpha = 0.1) +
    annotate("text", x = 0, y = c(0.5, 0.75, 1), label = c("0.50", "0.75", "1.00"), color = "#dadbda", size = 3) +
    theme_void()

# N-domain box
pB <- pA + geom_shape(data = Nshape, aes(x = x, y = y), fill = "#c04847", radius = unit(2, "mm"))
# White bar at every site under selection
pC <- pB + geom_bar(data = subset(selection_freq_table, sites <= 107), stat = "identity", aes(y = 0.3), width = 0.5, alpha = 1, color = "white", fill = "white")
# Asterisks: grey for sites found by two tests, red for sites found by three
pD <- pC + geom_point(data = subset(selection_freq_table, Freq %in% 2 & sites <= 107), shape = 42, size = 10, color = "#4c4c4c", alpha = 1)
pE <- pD + geom_point(data = subset(selection_freq_table, Freq %in% 3 & sites <= 107), shape = 42, size = 10, color = "red", alpha = 1)
# Sliding window trace
pF <- pE + geom_line(data = subset(pos_selection_slide, sites <= 107), aes(y = Freq), size = 0.75, linetype = "dotted", color = "#0084ff")
# Triangle built from Nshape2
pG <- pF + geom_polygon(data = Nshape2, aes(x = x, y = y), color = "black", fill = NA)
# Curated residues
pH <- pG + add_curation(currated_sites$adhesins, currated_sites[1])
pI <- pH + add_curation(currated_sites$host_binding, currated_sites[2])
pI

ggsave("CCM1_Nd_7-24-20_v1.pdf", width = 500, height = 100, units = "mm")

# NOTE(review): this layer is built but never added to a plot -- appears to be
# a leftover; confirm before removing.
geom_segment(aes(x = 26.75, y = 0.05, xend = 27.25, yend = 0.05), size = 1, color = "blue", lineend = "square")

